# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import plotly.express as px
%matplotlib inline
# Load the dataset into pandas dataframe
df = pd.read_csv('housing.csv')
df.head(10)
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
| 5 | -122.25 | 37.85 | 52.0 | 919.0 | 213.0 | 413.0 | 193.0 | 4.0368 | 269700.0 | NEAR BAY |
| 6 | -122.25 | 37.84 | 52.0 | 2535.0 | 489.0 | 1094.0 | 514.0 | 3.6591 | 299200.0 | NEAR BAY |
| 7 | -122.25 | 37.84 | 52.0 | 3104.0 | 687.0 | 1157.0 | 647.0 | 3.1200 | 241400.0 | NEAR BAY |
| 8 | -122.26 | 37.84 | 42.0 | 2555.0 | 665.0 | 1206.0 | 595.0 | 2.0804 | 226700.0 | NEAR BAY |
| 9 | -122.25 | 37.84 | 52.0 | 3549.0 | 707.0 | 1551.0 | 714.0 | 3.6912 | 261100.0 | NEAR BAY |
# Check the last 10 observations of the dataset
df.tail(10)
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 20630 | -121.32 | 39.29 | 11.0 | 2640.0 | 505.0 | 1257.0 | 445.0 | 3.5673 | 112000.0 | INLAND |
| 20631 | -121.40 | 39.33 | 15.0 | 2655.0 | 493.0 | 1200.0 | 432.0 | 3.5179 | 107200.0 | INLAND |
| 20632 | -121.45 | 39.26 | 15.0 | 2319.0 | 416.0 | 1047.0 | 385.0 | 3.1250 | 115600.0 | INLAND |
| 20633 | -121.53 | 39.19 | 27.0 | 2080.0 | 412.0 | 1082.0 | 382.0 | 2.5495 | 98300.0 | INLAND |
| 20634 | -121.56 | 39.27 | 28.0 | 2332.0 | 395.0 | 1041.0 | 344.0 | 3.7125 | 116800.0 | INLAND |
| 20635 | -121.09 | 39.48 | 25.0 | 1665.0 | 374.0 | 845.0 | 330.0 | 1.5603 | 78100.0 | INLAND |
| 20636 | -121.21 | 39.49 | 18.0 | 697.0 | 150.0 | 356.0 | 114.0 | 2.5568 | 77100.0 | INLAND |
| 20637 | -121.22 | 39.43 | 17.0 | 2254.0 | 485.0 | 1007.0 | 433.0 | 1.7000 | 92300.0 | INLAND |
| 20638 | -121.32 | 39.43 | 18.0 | 1860.0 | 409.0 | 741.0 | 349.0 | 1.8672 | 84700.0 | INLAND |
| 20639 | -121.24 | 39.37 | 16.0 | 2785.0 | 616.0 | 1387.0 | 530.0 | 2.3886 | 89400.0 | INLAND |
# Check the shape of the dataset
df.shape
(20640, 10)
# Check the data type
df.dtypes
longitude float64 latitude float64 housing_median_age float64 total_rooms float64 total_bedrooms float64 population float64 households float64 median_income float64 median_house_value float64 ocean_proximity object dtype: object
# Check for null values in the dataset
df.isna().sum()
longitude 0 latitude 0 housing_median_age 0 total_rooms 0 total_bedrooms 207 population 0 households 0 median_income 0 median_house_value 0 ocean_proximity 0 dtype: int64
# Drop every row that contains a null value, modifying df in place.
# The null count above shows total_bedrooms is the only column with nulls (207 rows).
df.dropna(axis=0, inplace=True)
# check if null values still exist in the dataset
df.isna().sum()
longitude 0 latitude 0 housing_median_age 0 total_rooms 0 total_bedrooms 0 population 0 households 0 median_income 0 median_house_value 0 ocean_proximity 0 dtype: int64
# Change the datatype of some features from `float` to `int`.
# These columns hold whole numbers stored as floats, so cast them all in one call.
observations = ['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households']
df = df.astype({column: 'int' for column in observations})
# Verify that the selected columns are now integer-typed
df.dtypes
longitude float64 latitude float64 housing_median_age int32 total_rooms int32 total_bedrooms int32 population int32 households int32 median_income float64 median_house_value float64 ocean_proximity object dtype: object
# Statistical summary of the dataset
df.describe()
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
|---|---|---|---|---|---|---|---|---|---|
| count | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 | 20433.000000 |
| mean | -119.570689 | 35.633221 | 28.633094 | 2636.504233 | 537.870553 | 1424.946949 | 499.433465 | 3.871162 | 206864.413155 |
| std | 2.003578 | 2.136348 | 12.591805 | 2185.269567 | 421.385070 | 1133.208490 | 382.299226 | 1.899291 | 115435.667099 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 | 14999.000000 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1450.000000 | 296.000000 | 787.000000 | 280.000000 | 2.563700 | 119500.000000 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.536500 | 179700.000000 |
| 75% | -118.010000 | 37.720000 | 37.000000 | 3143.000000 | 647.000000 | 1722.000000 | 604.000000 | 4.744000 | 264700.000000 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 | 500001.000000 |
The following were observed in the dataset:
Only one feature is categorical (`ocean_proximity`), while the rest are numeric. I want to know how location affects the "median house value".
I anticipate that house prices will depend significantly on how close the houses are to the water, on their longitude and latitude, and on their number of rooms.
# Check 10 samples of the dataset
df.sample(10)
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9165 | -118.43 | 34.42 | 13 | 3600 | 580 | 1799 | 576 | 6.2971 | 218300.0 | <1H OCEAN |
| 17105 | -122.22 | 37.46 | 13 | 2888 | 546 | 1182 | 504 | 6.0255 | 409300.0 | NEAR OCEAN |
| 12029 | -117.46 | 33.93 | 16 | 4112 | 880 | 2821 | 857 | 3.0122 | 114700.0 | INLAND |
| 16355 | -121.32 | 38.03 | 25 | 2474 | 513 | 1947 | 524 | 2.5742 | 98400.0 | INLAND |
| 11700 | -117.97 | 33.88 | 9 | 1344 | 279 | 530 | 265 | 5.0731 | 185100.0 | <1H OCEAN |
| 17728 | -121.79 | 37.32 | 6 | 2850 | 561 | 2160 | 581 | 5.5336 | 241900.0 | <1H OCEAN |
| 6273 | -117.94 | 34.03 | 35 | 1375 | 249 | 1015 | 239 | 4.0521 | 151800.0 | <1H OCEAN |
| 2550 | -124.16 | 40.78 | 43 | 2241 | 446 | 932 | 395 | 2.9038 | 82000.0 | NEAR OCEAN |
| 12929 | -121.21 | 38.66 | 15 | 6940 | 1019 | 2829 | 990 | 5.4889 | 232300.0 | INLAND |
| 13286 | -117.65 | 34.08 | 35 | 2621 | 391 | 1074 | 391 | 4.7176 | 166400.0 | INLAND |
# Bar chart of how many records fall into each ocean-proximity category,
# drawn in the first colour of the active seaborn palette
base_color = sb.color_palette()[0]
proximity_ax = sb.countplot(x='ocean_proximity', data=df, color=base_color)
proximity_ax.set_title('Ocean Proximity Bars');
The majority of the homes in the dataset are less than an hour from the ocean (`<1H OCEAN`), followed by those located inland.
# Define a function to plot Histogram
def histogram(DataFrame, x_value, x_label, y_label, title, nbins):
    """Draw a labelled histogram of one column on the current axes.

    Args:
        DataFrame: Table that contains the column to plot.
        x_value: Name of the column whose values are binned.
        x_label: Text for the x-axis.
        y_label: Text for the y-axis.
        title: Title placed above the plot.
        nbins: Forwarded to matplotlib as ``bins``.
    """
    axes = plt.gca()
    axes.hist(x=x_value, bins=nbins, data=DataFrame)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)
# Histogram of House Median Age (10 bins)
histogram(
    DataFrame=df,
    x_value='housing_median_age',
    x_label='House Median Age',
    y_label='Frequency',
    title='House Median Age',
    nbins=10,
)
The distribution peaks at just above 30 years and is slightly skewed to the left. Most of the homes are more than ten years old.
# Histogram of House Value (10 bins)
histogram(
    DataFrame=df,
    x_value='median_house_value',
    x_label='House Value',
    y_label='Frequency',
    title='House Value',
    nbins=10,
)
The house value is skewed to the right, with a peak just after 100,000, which is interesting. It seems that most of the houses that are over 10 years old are not that costly, i.e. they are more affordable. Let's look at the distribution of household income below.
# Histogram of Household Income (20 bins).
# median_income is measured in tens of thousands of USD (values run from
# about 0.5 to 15), so the axis label states that unit.
histogram(df, 'median_income', 'Income [Tens of Thousands USD]', 'Frequency', 'Income', 20)
The Income is also largely skewed to the right.
The rooms are skewed to the right and compact between 0 and 5000.
# Histogram of Total Bedrooms (100 bins)
histogram(df, 'total_bedrooms', 'Bedrooms', 'Frequency', 'Bedrooms Distribution', 100)
# Render the bedrooms figure before starting the next one; otherwise both
# histograms land on the same axes and the second call overwrites the labels.
plt.show()
# Histogram of Total Rooms (100 bins)
histogram(df, 'total_rooms', 'Rooms', 'Frequency', 'Rooms Distribution', 100)
The bedrooms are also skewed to the right.
# Histogram of Population (100 bins)
histogram(df, 'population', 'Population', 'Frequency', 'Population', 100)
The population is skewed to the right just like the rooms and bedrooms.
# Histogram of Households (100 bins)
histogram(df, 'households', 'Households', 'Frequency', 'Households', 100)
The households follow a similar pattern to the rooms, bedrooms and population. The distribution is skewed to the right and has similar characteristics.
Most of the houses are valued within the range of 50,000 to 250,000 with peak around 200,000.
The income and the house value are both skewed to the right. Possibly household income is proportional to house value.
First, let's take a look at how the numeric features correlate.
df.columns
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
'total_bedrooms', 'population', 'households', 'median_income',
'median_house_value', 'ocean_proximity'],
dtype='object')
# Numeric variables used for the correlation analysis
numeric_vars = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]
# Pairwise correlation matrix, drawn as an annotated heatmap with the
# colour scale centred on zero
correlations = df[numeric_vars].corr()
plt.figure(figsize=(6, 6))
sb.heatmap(correlations, annot=True, cmap='vlag_r', center=0);
# Define a function for scatter plot.
def scatter(DataFrame, x_value, x_label, y_value, y_label, title):
    """Draw a labelled seaborn scatter plot of two columns.

    Args:
        DataFrame: Table that contains both columns.
        x_value: Name of the column plotted on the x-axis.
        x_label: Text for the x-axis.
        y_value: Name of the column plotted on the y-axis.
        y_label: Text for the y-axis.
        title: Title placed above the plot.
    """
    axes = sb.scatterplot(x=x_value, y=y_value, data=DataFrame)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)
# Scatter plot of house value and income.
# median_income is measured in tens of thousands of USD (values run from
# about 0.5 to 15), so the axis label states that unit.
scatter(
    df,
    'median_income',
    'Income [Tens of Thousands USD]',
    'median_house_value',
    'House Value [USD]',
    'Income vs House Value',
)
There is a strong correlation between income and house value which confirms our assumption.
# Scatter plot of house age and house value
scatter(
    DataFrame=df,
    x_value='housing_median_age',
    x_label='Housing Age [Years]',
    y_value='median_house_value',
    y_label='House Value [USD]',
    title='Housing Age vs House Value',
)
There is no correlation between age of the houses and their prices. House age is not a determining factor when considering its price.
# Violin plot of median house value for each ocean-proximity category,
# with a box plot drawn inside every violin (inner='box')
sb.violinplot(data=df, x='ocean_proximity', y='median_house_value', color=base_color, inner='box')
# Tilt the category labels by 45 degrees
plt.xticks(rotation=45);
The median house value on the island is relatively high compared to the other locations. Could that mean there are more rooms on the island, or are prices there simply on the high side? Let's look at the house value per room in each proximity category.
# Derive the density: median house value divided by total rooms, per record
df['density'] = df['median_house_value'].div(df['total_rooms'])
# Group by ocean proximity and find the average density of each group
density_groups = df.groupby('ocean_proximity')['density']
df1 = density_groups.mean()
df1
ocean_proximity <1H OCEAN 160.283393 INLAND 117.289168 ISLAND 287.767762 NEAR BAY 216.399820 NEAR OCEAN 181.834438 Name: density, dtype: float64
# Bar chart of Density and Proximity
density_ax = df1.plot.bar()
density_ax.set_xlabel('Ocean Proximity')
density_ax.set_ylabel('Density')
density_ax.set_title('Density vs Proximity');
Invariably, houses on the Island are the most expensive.
The plot confirms that household income strongly correlates with the value of the house. Generally, people purchase what they can afford. The age of the house is not a determining factor when purchasing a house.
The plot demonstrates the close relationship between household income and home value. People often buy what they can afford. The age of the property does not matter when buying a house.
The majority of housing costs on the island are fairly high, however there is a reasonable distribution for homes along the bay and the ocean. The value of the homes is unrelated to the age of the homes.
# Interactive map with one marker per record, positioned by latitude and
# longitude and coloured by median_house_value; ocean_proximity is added
# to the hover tooltip. The view is centred on lat 37.09, lon -121.
# NOTE(review): recent Plotly releases appear to deprecate scatter_mapbox in
# favour of scatter_map — confirm against the installed version before migrating.
fig = px.scatter_mapbox(df,
lat='latitude',
lon='longitude',
center={'lat':37.09, 'lon':-121},
height=600,
width=600,
color='median_house_value',
hover_data=['ocean_proximity'])
# Use OpenStreetMap tiles as the base map and set the figure title
fig.update_layout(mapbox_style='open-street-map', title='Housing Price and Location')
fig.show()
The closer the houses are to the ocean or the Bay Area, the higher the prices.
# Pair grid over the numeric variables: histograms on the diagonal,
# scatter plots in every off-diagonal cell
g=sb.PairGrid(data=df, vars=numeric_vars)
g.map_diag(sb.histplot)
g.map_offdiag(plt.scatter);
This confirms that houses close to the ocean are expensive.
The previous assumptions were strengthened.
The following are the conclusions derived from the analysis: